RzZ ggerganov committed

Commit 219d12b · 1 Parent(s): 83a0899

llama : add Qwen2VL support + multimodal RoPE (llama/10361)


* Barebones Qwen2VL LLM converter

* Add Qwen2VL cli entrypoint

* [WIP] add qwen2vl arch

* Verify m-rope output

* Add vl-rope/2d-rope support for qwen2vl ViT

* update qwen2vl cli tool

* update 5D tensor op workaround

* [WIP] qwen2vl vision model

* make batch and clip utils compatible with qwen2vl

* [WIP] create inference workflow and gguf convert script (fixes pending)

* correct vision-rope behavior, add the missing last layer back to ViT

* add arg parser to qwen2vl_surgery

* replace variable size array with vector

* cuda-gdb cmake preset

* add fp32 mrope, vision rope kernel

* add fp16 support for qwen2vl and m-rope

* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`

* fix rope op mode switching, outdated func args

* update `llama_hparams`

* update to keep up with upstream changes

* resolve linter and test errors

* add makefile entry, update special image padding token

* add mrope unit test, fix a few compiler warnings

* rename `mrope`-related functions and params

* minor updates on debug util, bug fixes

* add `m-rope` testcase to `test-backend-ops`

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <[email protected]>

* fix trailing whitespace

* store `llama_hparams.rope_sections` with fixed size array

* update position id tensor size check in GGML_OP_ROPE

* minor updates

* update `ggml_backend_*_supports_op` of unsupported backends

* remove old `rope_section` compare operator

---------

Co-authored-by: Georgi Gerganov <[email protected]>
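
The core of the change is "multimodal" RoPE (M-RoPE): instead of rotating every head-dimension pair with a single position id, the rotary dim-pairs are split into up to four sections (temporal, height, width, plus an extra slot for the vision encoder), each driven by its own position id. Below is a minimal standalone sketch of that section lookup, mirroring the logic of `ggml_mrope_cache_init` in the CPU diff further down; the helper itself is illustrative, not part of the commit:

    /* Illustrative only: which section (and hence which position id) a given
       rotary dim-pair k = i0/2 belongs to. sections[4] holds the number of
       dim-pairs assigned to the time/height/width/extra position ids. */
    static int mrope_section_of(int k, const int sections[4]) {
        const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
        const int sector    = k % sect_dims;
        if (sector < sections[0])                             return 0; // temporal
        if (sector < sections[0] + sections[1])               return 1; // height
        if (sector < sections[0] + sections[1] + sections[2]) return 2; // width
        return 3;                                                       // extra
    }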

ggml/include/ggml.h CHANGED
@@ -237,7 +237,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
-#define GGML_ROPE_TYPE_NEOX 2
+#define GGML_ROPE_TYPE_NEOX   2
+#define GGML_ROPE_TYPE_MROPE  8
+#define GGML_ROPE_TYPE_VISION 24
 
 #define GGUF_MAGIC "GGUF"
 
@@ -1443,6 +1445,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    GGML_API struct ggml_tensor * ggml_rope_multi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[4],
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
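
A hypothetical call site for the new API, assuming a Qwen2-VL-style setup; `ctx`, `cur`, `inp_pos`, and the hyperparameter values are placeholders rather than code from this commit. `b` (`inp_pos` here) must be an I32 vector carrying four position ids per token, and the section sizes are counted in rotary dim-pairs:

    // sketch only: t/h/w/e split in dim-pairs (16+24+24 = 64 pairs for a 128-dim head)
    int sections[4] = {16, 24, 24, 0};
    struct ggml_tensor * q_rot = ggml_rope_multi(
            ctx, cur, inp_pos, NULL /* freq_factors */,
            n_rot, sections, GGML_ROPE_TYPE_MROPE, n_ctx_orig,
            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);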
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
+
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
+
             return true;
         }
         case GGML_OP_UPSCALE: {
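
The two new mode constants are chosen so that `GGML_ROPE_TYPE_VISION` (24) is `GGML_ROPE_TYPE_MROPE` (8) with an extra bit (16) set. A compilable restatement of the checks used throughout this commit (constants re-stated here for self-containment):

    #include <stdbool.h>

    #define GGML_ROPE_TYPE_MROPE  8
    #define GGML_ROPE_TYPE_VISION 24 /* = 8 | 16, so the m-rope bit is also set */

    static bool is_mrope_mode (int mode) { return (mode & GGML_ROPE_TYPE_MROPE) != 0; } /* true for 8 and 24 */
    static bool is_vision_mode(int mode) { return mode == GGML_ROPE_TYPE_VISION;      } /* true only for 24 */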
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -9133,6 +9133,64 @@ static void ggml_rope_cache_init(
     }
 }
 
+static void ggml_mrope_cache_init(
+     float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale) {
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+    float theta_t = theta_base_t;
+    float theta_h = theta_base_h;
+    float theta_w = theta_base_w;
+    float theta_e = theta_base_e;  // extra position id for vision encoder
+    int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+    int sec_w = sections[1] + sections[0];
+    int sec_e = sections[2] + sec_w;
+    GGML_ASSERT(sect_dims <= ne0);
+
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
+
+        int sector = (i0 / 2) % sect_dims;
+        if (indep_sects) {
+            // compute theta independently for each dim section
+            // (i.e. reset the corresponding theta when `i0` goes from one section to another)
+            if (sector == 0) {
+                theta_t = theta_base_t;
+            }
+            else if (sector == sections[0]) {
+                theta_h = theta_base_h;
+            }
+            else if (sector == sec_w) {
+                theta_w = theta_base_w;
+            }
+            else if (sector == sec_e) {
+                theta_e = theta_base_e;
+            }
+        }
+
+        float theta = theta_t;
+        if (sector >= sections[0] && sector < sec_w) {
+            theta = theta_h;
+        }
+        else if (sector >= sec_w && sector < sec_w + sections[2]) {
+            theta = theta_w;
+        }
+        else if (sector >= sec_w + sections[2]) {
+            theta = theta_e;
+        }
+
+        rope_yarn(
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta_t *= theta_scale;
+        theta_w *= theta_scale;
+        theta_h *= theta_scale;
+        theta_e *= theta_scale;
+    }
+}
+
 static void ggml_compute_forward_rope_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst,
@@ -9143,6 +9201,7 @@ static void ggml_compute_forward_rope_f32(
     const struct ggml_tensor * src2 = dst->src[2];
 
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    int sections[4];
 
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -9156,6 +9215,7 @@ static void ggml_compute_forward_rope_f32(
     memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -9188,6 +9248,16 @@ static void ggml_compute_forward_rope_f32(
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, multimodal rotary position embedding
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne0/2);
+    }
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -9203,18 +9273,63 @@ static void ggml_compute_forward_rope_f32(
 
     const int32_t * pos = (const int32_t *) src1->data;
 
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
+    for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
+        for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
 
             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            if (!is_mrope) {
+                const int64_t p = pos[i2];
+                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+            else {
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                const int64_t p_e = pos[i2 + ne2 * 3];
+                ggml_mrope_cache_init(
+                    p_t, p_h, p_w, p_e, sections, is_vision,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
 
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
+            for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                if (!is_neox) {
+                if (is_neox || is_mrope) {
+                    if (is_vision) {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+                            const float x0 = src[0];
+                            const float x1 = src[n_dims];
+
+                            dst_data[0]      = x0*cos_theta - x1*sin_theta;
+                            dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
+                        }
+                    } else {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+                            const float x0 = src[0];
+                            const float x1 = src[n_dims/2];
+
+                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
+                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                        }
+                    }
+                } else {
                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                         const float cos_theta = cache[i0 + 0];
                         const float sin_theta = cache[i0 + 1];
@@ -9228,8 +9343,10 @@ static void ggml_compute_forward_rope_f32(
                         dst_data[0] = x0*cos_theta - x1*sin_theta;
                         dst_data[1] = x0*sin_theta + x1*cos_theta;
                     }
-                } else {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                }
+
+                if (is_vision) {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                         const int64_t ic = i0/2;
 
                         const float cos_theta = cache[i0 + 0];
@@ -9239,19 +9356,20 @@ static void ggml_compute_forward_rope_f32(
                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
 
                         const float x0 = src[0];
-                        const float x1 = src[n_dims/2];
+                        const float x1 = src[n_dims];
 
-                        dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                        dst_data[0]      = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
                     }
-                }
-
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
+                } else {
+                    // fill the remaining channels with data from the src tensor
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
                 }
             }
         }
@@ -9269,6 +9387,7 @@ static void ggml_compute_forward_rope_f16(
     const struct ggml_tensor * src2 = dst->src[2];
 
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    int sections[4];
 
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -9281,6 +9400,8 @@ static void ggml_compute_forward_rope_f16(
     memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
+
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -9313,6 +9434,16 @@ static void ggml_compute_forward_rope_f16(
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne0/2);
+    }
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -9330,16 +9461,61 @@ static void ggml_compute_forward_rope_f16(
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
 
             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            if (!is_mrope) {
+                const int64_t p = pos[i2];
+                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+            else {
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                const int64_t p_e = pos[i2 + ne2 * 3];
+                ggml_mrope_cache_init(
+                    p_t, p_h, p_w, p_e, sections, is_vision,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
 
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                if (!is_neox) {
+                if (is_neox || is_mrope) {
+                    if (is_vision) {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+                            const float x0 = GGML_FP16_TO_FP32(src[0]);
+                            const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
+
+                            dst_data[0]      = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        }
+                    } else {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+                            const float x0 = GGML_FP16_TO_FP32(src[0]);
+                            const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+
+                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        }
+                    }
+                } else {
                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                         const float cos_theta = cache[i0 + 0];
                         const float sin_theta = cache[i0 + 1];
@@ -9353,8 +9529,10 @@ static void ggml_compute_forward_rope_f16(
                         dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                         dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
-                } else {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                }
+
+                if (is_vision) {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                         const int64_t ic = i0/2;
 
                         const float cos_theta = cache[i0 + 0];
@@ -9364,19 +9542,19 @@ static void ggml_compute_forward_rope_f16(
                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
 
                         const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
 
-                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        dst_data[0]      = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
-                }
-
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
+                } else {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
                 }
             }
         }
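
The mrope branch above reads four position ids per token from `src1`: `pos[i2]`, `pos[i2 + ne2]`, `pos[i2 + ne2*2]`, `pos[i2 + ne2*3]`, i.e. four consecutive blocks of `ne2` entries. A sketch of how a caller might fill that buffer for plain text tokens (Qwen2-VL uses the same index for the t/h/w components of text tokens; keeping the extra slot at 0 is an assumption for illustration, and this helper is not part of the commit):

    #include <stdint.h>

    void fill_text_positions(int32_t * pos, int n_tokens) {
        for (int i = 0; i < n_tokens; i++) {
            pos[i + n_tokens*0] = i; // temporal
            pos[i + n_tokens*1] = i; // height
            pos[i + n_tokens*2] = i; // width
            pos[i + n_tokens*3] = 0; // extra (used by the vision encoder)
        }
    }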
ggml/src/ggml-cuda/rope.cu CHANGED
@@ -4,6 +4,11 @@ struct rope_corr_dims {
     float v[2];
 };
 
+
+struct mrope_sections {
+    int v[4];
+};
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
@@ -108,6 +113,105 @@ static __global__ void rope_neox(
     dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
+template<typename T, bool has_ff>
+static __global__ void rope_multi(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows;
+
+    int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_ff>
+static __global__ void rope_vision(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i  = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows; // i2-th token
+
+    int sect_dims = sections.v[0] + sections.v[1];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        const int p = sector;
+        theta_base = pos[i2]*powf(theta_scale, p);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        const int p = sector - sections.v[0];
+        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims];
+
+    dst[i + 0]      = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
+}
+
 template<typename T>
 static void rope_norm_cuda(
     const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
@@ -156,6 +260,56 @@ static void rope_neox_cuda(
     }
 }
 
+template<typename T>
+static void rope_multi_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+        );
+    } else {
+        rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+        );
+    }
+}
+
+template<typename T>
+static void rope_vision_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+    // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
+    // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+        );
+    } else {
+        rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+        );
+    }
+}
+
 static void rope_norm_cuda_f16(
     const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
@@ -185,6 +339,38 @@ static void rope_neox_cuda_f32(
     rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }
 
+static void rope_multi_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+    rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_multi_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+    rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+    rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+    rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
@@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
+    const int64_t ne00 = src0->ne[0]; // head dims
+    const int64_t ne01 = src0->ne[1]; // num heads
+    const int64_t ne02 = src0->ne[2]; // num tokens
     const int64_t nr = ggml_nrows(src0);
 
     //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int mode = ((int32_t *) dst->op_params)[2];
     //const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    mrope_sections sections;
 
     // RoPE alteration for extended context
     float freq_base;
@@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }
 
     const int32_t * pos = (const int32_t *) src1_d;
 
@@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         } else {
            GGML_ABORT("fatal error");
         }
+    } else if (is_mrope && !is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_multi_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_multi_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_vision_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_vision_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
     } else {
         if (src0->type == GGML_TYPE_F32) {
             rope_norm_cuda_f32(
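
In the new kernels each x-index of the launch grid covers one (head, token) row, and the token index `i2` is recovered by dividing out the head count (`ggml_cuda_op_rope` passes `ne01` as `p_delta_rows`). A host-side restatement of that index math, illustrative only:

    /* Sketch: recover head and token indices from the flattened row index used
       by rope_multi/rope_vision; n_heads corresponds to ne01 == p_delta_rows. */
    static void map_row(int row, int n_heads, int * i1, int * i2) {
        *i1 = row % n_heads; // attention head
        *i2 = row / n_heads; // token index, used to select the t/h/w/e position ids
    }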
ggml/src/ggml-kompute/ggml-kompute.cpp CHANGED
@@ -1419,8 +1419,18 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_DUP:
         case GGML_OP_CPY:
         case GGML_OP_CONT:
ggml/src/ggml-metal/ggml-metal.m CHANGED
@@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
         case GGML_OP_ARGMAX:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_IM2COL:
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:
@@ -3026,7 +3036,9 @@
             } break;
         case GGML_OP_ROPE:
             {
-                GGML_ASSERT(ne10 == ne02);
+                // make sure we have one or more position ids (ne10) per token (ne02)
+                GGML_ASSERT(ne10 % ne02 == 0);
+                GGML_ASSERT(ne10 >= ne02);
 
                 const int nth = MIN(1024, ne00);
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -4488,7 +4488,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
-            return ggml_is_contiguous(op->src[0]);
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
         case GGML_OP_IM2COL:
             // TODO: add support for the new F32 operations
             return op->src[0]->type == GGML_TYPE_F16;
ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -7687,7 +7687,16 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_REPEAT:
            return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
         case GGML_OP_ROPE:
-            return ggml_is_contiguous(op->src[0]);
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
ggml/src/ggml.c CHANGED
@@ -3517,15 +3517,18 @@ static struct ggml_tensor * ggml_rope_impl(
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
 
+    int sections[4] = {0, 0, 0, 0};
+
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params +  5, &freq_base,    sizeof(float));
     memcpy(params +  6, &freq_scale,   sizeof(float));
     memcpy(params +  7, &ext_factor,   sizeof(float));
     memcpy(params +  8, &attn_factor,  sizeof(float));
     memcpy(params +  9, &beta_fast,    sizeof(float));
     memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &sections,     sizeof(int)*4);
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
@@ -3547,6 +3550,53 @@ struct ggml_tensor * ggml_rope(
     );
 }
 
+struct ggml_tensor * ggml_rope_multi(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    // Multimodal Rotary Position Embedding
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+    GGML_ASSERT(ggml_is_vector(b));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(&params[11], sections, sizeof(int)*4);
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_rope_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
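
With the four section sizes appended, `GGML_OP_ROPE` now carries 15 int32-sized parameter slots. A standalone unpacking sketch that mirrors the `memcpy` calls above; the struct is illustrative, not part of the ggml API, and relies on all members being 4 bytes with no padding:

    #include <stdint.h>
    #include <string.h>

    struct rope_op_params {
        int32_t n_past, n_dims, mode, n_ctx, n_ctx_orig;        // slots 0..4
        float   freq_base, freq_scale, ext_factor, attn_factor; // slots 5..8
        float   beta_fast, beta_slow;                           // slots 9..10
        int32_t sections[4];                                    // slots 11..14 (new in this commit)
    };

    static void unpack_rope_op_params(const int32_t * op_params, struct rope_op_params * p) {
        memcpy(p, op_params, sizeof(*p)); // 15 * 4 bytes, same layout as the params array above
    }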