Vineel Abhinav committed
Commit 7941e9b · 1 Parent(s): d9bd7ce

ggml: aarch64: Implement SVE F32 kernels for vector functions (llama/13843)


* F32-Mamba-SVE

* F32-Mamba-SVE

* Resolve test errors-1

* Resolve test errors-2

* F32-vec-SVE

* F32-vec-SVE

* F32-vec-SVE

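Taken together, the diff below adds an SVE (Scalable Vector Extension) code path for the F32 CPU kernels: simd-mappings.h gains GGML_F32xt wrappers that pair each SVE intrinsic with a default all-true predicate, vec.cpp and vec.h gain SVE implementations of ggml_vec_dot_f32, ggml_vec_mad_f32 and ggml_vec_scale_f32, and ops.cpp lets the wkv6/gla kernels pick their F32 vector width at run time with svcntw(). The core idiom is the same everywhere: full-width loops under an all-true predicate, then a predicated tail instead of a scalar remainder loop. A minimal standalone sketch of that idiom (illustrative only, not the ggml code; assumes an SVE-enabled aarch64 toolchain, e.g. -march=armv8-a+sve):

    // Sketch of the SVE pattern the commit applies to the F32 vector kernels:
    // process whole vectors with an all-true predicate, then mop up the tail
    // with a svwhilelt predicate so no scalar remainder loop is needed.
    #include <arm_sve.h>
    #include <stdint.h>

    static float sve_dot_f32(const float * x, const float * y, int64_t n) {
        const int64_t epr = (int64_t) svcntw();    // f32 lanes per register, VL-dependent
        svfloat32_t acc = svdup_n_f32(0.0f);
        int64_t i = 0;
        for (; i + epr <= n; i += epr) {           // full vectors, all lanes active
            svfloat32_t ax = svld1_f32(svptrue_b32(), x + i);
            svfloat32_t ay = svld1_f32(svptrue_b32(), y + i);
            acc = svmla_f32_m(svptrue_b32(), acc, ax, ay);   // acc += ax*ay
        }
        if (i < n) {                               // predicated tail
            svbool_t pg = svwhilelt_b32((int32_t) i, (int32_t) n);
            svfloat32_t ax = svld1_f32(pg, x + i);
            svfloat32_t ay = svld1_f32(pg, y + i);
            acc = svmla_f32_m(pg, acc, ax, ay);
        }
        return svaddv_f32(svptrue_b32(), acc);     // horizontal reduction
    }

The real kernels below additionally unroll eight registers per iteration and reduce them with GGML_F32_VEC_REDUCE, as the vec.cpp hunk shows.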
ggml/src/ggml-cpu/ops.cpp CHANGED
@@ -7641,8 +7641,8 @@ static void ggml_compute_forward_ssm_scan_f32(
     const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
     const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
     const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
-    float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
-    float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
+    float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+    float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}

     // use the output as the source for the next token-wise iterations
     if (i2 > 0) { s0 = s; }
@@ -8070,6 +8070,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
     #define GGML_F32X_MUL GGML_F32x16_MUL
     #define GGML_F32X_FMA GGML_F32x16_FMA
     #define WKV_VECTOR_SIZE 16
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    #define GGML_F32X GGML_F32xt
+    #define GGML_F32X_SET1 GGML_F32xt_SET1
+    #define GGML_F32X_LOAD GGML_F32xt_LOAD
+    #define GGML_F32X_STORE GGML_F32xt_STORE
+    #define GGML_F32X_MUL GGML_F32xt_MUL
+    #define GGML_F32X_FMA GGML_F32xt_FMA
+    #define WKV_VECTOR_SIZE 8
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     #define GGML_F32X GGML_F32x4
     #define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8080,8 +8088,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
     #define WKV_VECTOR_SIZE 4
     #endif

+    int wkv_vector_size;
     #ifdef WKV_VECTOR_SIZE
-    const int64_t vec_count = head_size / WKV_VECTOR_SIZE;
+        #if defined(__ARM_FEATURE_SVE)
+            wkv_vector_size = svcntw();
+        #else
+            wkv_vector_size = WKV_VECTOR_SIZE;
+        #endif
+        const int64_t vec_count = head_size / wkv_vector_size;

     for (int64_t t = 0; t < T; t++) {
         size_t t_offset = t * t_stride;
@@ -8111,7 +8125,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
             GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);

             for (int64_t j = 0; j < vec_count; j++) {
-                size_t base_j = j * WKV_VECTOR_SIZE;
+                size_t base_j = j * wkv_vector_size;
                 size_t t_h_j_offset = t_h_offset + base_j;
                 size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8136,7 +8150,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
             }

             // Handle remaining elements, this will not be used.
-            for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) {
+            for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) {
                 size_t t_h_j_offset = t_h_offset + j;
                 size_t h_2d_i_j_offset = h_2d_i_offset + j;
                 float v_val = v[t_h_j_offset];
@@ -8272,6 +8286,14 @@ static void ggml_compute_forward_gla_f32(
     #define GGML_F32X_MUL GGML_F32x16_MUL
     #define GGML_F32X_FMA GGML_F32x16_FMA
     #define GLA_VECTOR_SIZE 16
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    #define GGML_F32X GGML_F32xt
+    #define GGML_F32X_SET1 GGML_F32xt_SET1
+    #define GGML_F32X_LOAD GGML_F32xt_LOAD
+    #define GGML_F32X_STORE GGML_F32xt_STORE
+    #define GGML_F32X_MUL GGML_F32xt_MUL
+    #define GGML_F32X_FMA GGML_F32xt_FMA
+    #define GLA_VECTOR_SIZE 8
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     #define GGML_F32X GGML_F32x4
     #define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8282,8 +8304,14 @@ static void ggml_compute_forward_gla_f32(
     #define GLA_VECTOR_SIZE 4
     #endif

+    int gla_vector_size;
     #ifdef GLA_VECTOR_SIZE
-    const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
+        #if defined(__ARM_FEATURE_SVE)
+            gla_vector_size = svcntw();
+        #else
+            gla_vector_size = GLA_VECTOR_SIZE;
+        #endif
+        const int64_t vec_count = head_size / gla_vector_size;

     for (int64_t t = 0; t < T; t++) {
         size_t t_offset = t * t_stride;
@@ -8310,7 +8338,7 @@ static void ggml_compute_forward_gla_f32(
             GGML_F32X g_vec = GGML_F32X_SET1(g_val);

             for (int64_t j = 0; j < vec_count; j++) {
-                size_t base_j = j * GLA_VECTOR_SIZE;
+                size_t base_j = j * gla_vector_size;
                 size_t t_h_j_offset = t_h_offset + base_j;
                 size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8334,7 +8362,7 @@ static void ggml_compute_forward_gla_f32(
             }

             // Handle remaining elements, this will not be used.
-            for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
+            for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) {
                 size_t t_h_j_offset = t_h_offset + j;
                 size_t h_2d_i_j_offset = h_2d_i_offset + j;
                 float v_val = v[t_h_j_offset];
@@ -8443,83 +8471,126 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
     int64_t h_stride_2d = head_size * head_size;

     #if defined(GGML_SIMD)
-    for (int64_t t = 0; t < T; t++) {
-        int64_t t_offset = t * t_stride;
-        int64_t state_offset = head_size * C * (t / (T / n_seqs));
-        float * state_cur = state + state_offset;
-        float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
-
-        for (int64_t h = h_start; h < h_end; h++) {
-            int64_t h_offset = h * h_stride;
-            int64_t t_h_offset = t_offset + h_offset;
-            int64_t h_2d_offset = h * h_stride_2d;
-
-            for (int64_t ii = 0; ii < head_size; ii++) {
-                int64_t t_h_i_offset = t_h_offset + ii;
-                int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
-
-                GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
-
-                float sa = 0;
-                {
-                    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-                    GGML_F32_VEC ax[GGML_F32_ARR];
-                    GGML_F32_VEC ay[GGML_F32_ARR];
-                    for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
-                        for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
-                            ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
-                            ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
-                            sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
-                        }
-                    }
-                    GGML_F32_VEC_REDUCE(sa, sum);
-                }
-
-                GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
-
-                int64_t j = 0;
-                GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-                for (; j < head_size; j += GGML_F32_STEP) {
-                    for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
-                        int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
-                        int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
-
-                        GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
-                        GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
-                        GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
-                        GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
-
-                        k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
-
-                        GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
-                        // kv + s * decay + sa * b
-                        state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
-                        state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
-                        GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
-
-                        result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
-                    }
-                }
-                GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
-
-                // There shouldn't be left-overs though.
-                for (; j < head_size; j++) {
-                    int64_t t_h_j_offset = t_h_offset + j;
-                    int64_t h_2d_i_j_offset = h_2d_i_offset + j;
-
-                    float r_val = r[t_h_j_offset];
-                    float w_val = w[t_h_j_offset];
-                    float k_val = k[t_h_j_offset];
-                    float b_val = b[t_h_j_offset];
-                    float kv_val = v[t_h_i_offset] * k_val;
-
-                    float prev_state_val = state_prev[h_2d_i_j_offset];
-                    state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
-                    dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
-                }
-            }
-        }
-    }
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation //TODO: Write SVE code
+        for (int64_t t = 0; t < T; t++) {
+            int64_t t_offset = t * t_stride;
+            int64_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                int64_t h_offset = h * h_stride;
+                int64_t t_h_offset = t_offset + h_offset;
+                int64_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    int64_t t_h_i_offset = t_h_offset + i;
+                    int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float v_val = v[t_h_i_offset];
+
+                    float sa = 0, result = 0;
+                    for (int64_t j = 0; j < head_size; j++) {
+                        sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
+                    }
+
+                    for (int64_t j = 0; j < head_size; j++) {
+                        int64_t t_h_j_offset = t_h_offset + j;
+                        int64_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                        float r_val = r[t_h_j_offset];
+                        float w_val = w[t_h_j_offset];
+                        float k_val = k[t_h_j_offset];
+                        float b_val = b[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
+                        result += state_cur[h_2d_i_j_offset] * r_val;
+                    }
+                    dst_data[t_h_i_offset] = result;
+                }
+            }
+        }
+    #else
+        for (int64_t t = 0; t < T; t++) {
+            int64_t t_offset = t * t_stride;
+            int64_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                int64_t h_offset = h * h_stride;
+                int64_t t_h_offset = t_offset + h_offset;
+                int64_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t ii = 0; ii < head_size; ii++) {
+                    int64_t t_h_i_offset = t_h_offset + ii;
+                    int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
+
+                    GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
+
+                    float sa = 0;
+                    {
+                        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+                        GGML_F32_VEC ax[GGML_F32_ARR];
+                        GGML_F32_VEC ay[GGML_F32_ARR];
+                        for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
+                            for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
+                                ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
+                                ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
+                                sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
+                            }
+                        }
+                        GGML_F32_VEC_REDUCE(sa, sum);
+                    }
+
+                    GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
+
+                    int64_t j = 0;
+                    GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+                    for (; j < head_size; j += GGML_F32_STEP) {
+                        for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
+                            int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
+                            int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
+
+                            GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
+                            GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
+                            GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
+                            GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
+
+                            k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
+
+                            GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
+                            // kv + s * decay + sa * b
+                            state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
+                            state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
+                            GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
+
+                            result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
+                        }
+                    }
+                    GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
+
+                    // There shouldn't be left-overs though.
+                    for (; j < head_size; j++) {
+                        int64_t t_h_j_offset = t_h_offset + j;
+                        int64_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                        float r_val = r[t_h_j_offset];
+                        float w_val = w[t_h_j_offset];
+                        float k_val = k[t_h_j_offset];
+                        float b_val = b[t_h_j_offset];
+                        float kv_val = v[t_h_i_offset] * k_val;
+
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
+                        dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
+                    }
+                }
+            }
+        }
+    #endif
     #else
     for (int64_t t = 0; t < T; t++) {
         int64_t t_offset = t * t_stride;
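The wkv6 and gla hunks above keep the compile-time WKV_VECTOR_SIZE/GLA_VECTOR_SIZE macros (the 8 under SVE is only a nominal value) but read the real lane count with svcntw() at run time, so vec_count and base_j are computed from the vector length the hardware actually has. A hedged, standalone sketch of that partition (scale_row_f32 is an invented helper for illustration, not ggml code):

    // Split head_size into full vectors plus a scalar remainder, using a
    // runtime lane count on SVE and a fixed width otherwise.
    #include <arm_sve.h>
    #include <stdint.h>

    static void scale_row_f32(float * row, float s, int64_t head_size) {
    #if defined(__ARM_FEATURE_SVE)
        const int64_t lanes     = (int64_t) svcntw();   // runtime f32 lanes per register
        const int64_t vec_count = head_size / lanes;
        for (int64_t j = 0; j < vec_count; j++) {
            const int64_t base_j = j * lanes;           // same role as base_j in the kernels
            svfloat32_t v = svld1_f32(svptrue_b32(), row + base_j);
            v = svmul_n_f32_m(svptrue_b32(), v, s);
            svst1_f32(svptrue_b32(), row + base_j, v);
        }
        for (int64_t j = vec_count * lanes; j < head_size; j++) {
            row[j] *= s;                                // remainder; empty if head_size % lanes == 0
        }
    #else
        for (int64_t j = 0; j < head_size; j++) {
            row[j] *= s;
        }
    #endif
    }

With a 256-bit vector unit svcntw() returns 8, so head_size = 64 gives vec_count = 8 and an empty remainder loop, which matches the "this will not be used" comment in the kernels.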
ggml/src/ggml-cpu/simd-mappings.h CHANGED
@@ -17,7 +17,123 @@
 // number of elements to fit in a single register
 //

-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
+
+#define GGML_SIMD
+
+// F32 SVE
+#define GGML_F32_EPR 8
+#define DEFAULT_PG svptrue_b32()
+
+#define GGML_F32xt svfloat32_t
+#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
+#define GGML_F32xt_SET1(x) svdup_n_f32(x)
+#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
+#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
+#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
+#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
+#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
+#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
+{ \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
+    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
+    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
+    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
+}
+#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
+
+#define GGML_F32_VEC GGML_F32xt
+#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
+#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
+#define GGML_F32_VEC_STORE GGML_F32xt_STORE
+#define GGML_F32_VEC_FMA GGML_F32xt_FMA
+#define GGML_F32_VEC_ADD GGML_F32xt_ADD
+#define GGML_F32_VEC_MUL GGML_F32xt_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    #define GGML_F16_STEP 32
+    #define GGML_F16_EPR 8
+
+    #define GGML_F16x8 float16x8_t
+    #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
+    #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
+    #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
+    #define GGML_F16x8_STORE vst1q_f16
+    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+    #define GGML_F16x8_ADD vaddq_f16
+    #define GGML_F16x8_MUL vmulq_f16
+    #define GGML_F16x8_REDUCE(res, x) \
+    do { \
+        int offset = GGML_F16_ARR >> 1; \
+        for (int i = 0; i < offset; ++i) { \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+        } \
+        offset >>= 1; \
+        for (int i = 0; i < offset; ++i) { \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+        } \
+        offset >>= 1; \
+        for (int i = 0; i < offset; ++i) { \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+        } \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
+    } while (0)
+
+    #define GGML_F16_VEC GGML_F16x8
+    #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
+    #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
+    #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
+    #define GGML_F16_VEC_FMA GGML_F16x8_FMA
+    #define GGML_F16_VEC_ADD GGML_F16x8_ADD
+    #define GGML_F16_VEC_MUL GGML_F16x8_MUL
+    #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
+#else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+
+    #define GGML_F16_STEP 16
+    #define GGML_F16_EPR 4
+
+    #define GGML_F32Cx4 float32x4_t
+    #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
+    #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
+    #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+    #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
+    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define GGML_F32Cx4_ADD vaddq_f32
+    #define GGML_F32Cx4_MUL vmulq_f32
+    #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+    #define GGML_F16_VEC GGML_F32Cx4
+    #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+    #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+    #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
+    #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+    #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+    #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+    #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

 #define GGML_SIMD

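The new SVE block maps the generic GGML_F32_VEC_* names onto GGML_F32xt_* wrappers. Each wrapper has an *_IMPL form that takes an explicit predicate and a short form that splices in DEFAULT_PG (svptrue_b32()) through __VA_ARGS__, so kernel code can stay predicate-free on the fast path. A usage sketch, assuming the block above is in scope (axpy_one_vector is an illustrative helper, not part of ggml):

    // Assumes <arm_sve.h> and the GGML_F32xt_* macros above are visible (i.e.
    // this lives in a ggml-cpu translation unit), and that x and y point at at
    // least one full vector's worth of floats.
    static inline void axpy_one_vector(float * y, const float * x, float v) {
        GGML_F32xt vx = GGML_F32xt_SET1(v);   // svdup_n_f32(v)
        GGML_F32xt ax = GGML_F32xt_LOAD(x);   // svld1_f32(svptrue_b32(), x)
        GGML_F32xt ay = GGML_F32xt_LOAD(y);
        ay = GGML_F32xt_FMA(ax, vx, ay);      // svmad_f32_m: ax*vx + ay
        GGML_F32xt_STORE(y, ay);              // svst1_f32(svptrue_b32(), y, ay)
    }

GGML_F32xt_REDUCE takes eight accumulators, matching the eight-register unrolling the new ggml_vec_dot_f32 path in vec.cpp uses; GGML_F32_EPR stays at a nominal 8, and kernels that need the true lane count query it at run time instead.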
ggml/src/ggml-cpu/vec.cpp CHANGED
@@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G

 #if defined(GGML_SIMD)
     float sumf = 0.0f;
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
+
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t sum1 = svdup_n_f32(0.0f);
+        svfloat32_t sum2 = svdup_n_f32(0.0f);
+        svfloat32_t sum3 = svdup_n_f32(0.0f);
+        svfloat32_t sum4 = svdup_n_f32(0.0f);
+        svfloat32_t sum5 = svdup_n_f32(0.0f);
+        svfloat32_t sum6 = svdup_n_f32(0.0f);
+        svfloat32_t sum7 = svdup_n_f32(0.0f);
+        svfloat32_t sum8 = svdup_n_f32(0.0f);
+        svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
+        svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+
+            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
+
+            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
+            sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
+
+            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
+            sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
+
+            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
+            sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
+
+            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
+            sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
+
+            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
+            sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
+
+            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
+            sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
+        }
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += ggml_f32_epr) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        }
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
+        }
+        // reduce sum1,sum2 to sum1
+        GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+
+        GGML_F32_VEC ax[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
+        }
+
+        // reduce sum0..sum3 to sum0
+        GGML_F32_VEC_REDUCE(sumf, sum);
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += x[i]*y[i];
+        }
+    #endif
 #else
     // scalar
     ggml_float sumf = 0.0;
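The SVE ggml_vec_dot_f32 above keeps eight accumulators live per iteration (the usual motive being to hide FMA latency and keep more of the vector pipeline busy) and then drains leftovers in two stages: whole vectors first, then a single predicated tail. A worked example of the loop bounds, assuming a 256-bit SVE implementation and that ggml_cpu_get_sve_cnt() reports the vector length in bytes:

    // sve_register_length = 32 * 8  = 256 bits
    // ggml_f32_epr        = 256/32  = 8   f32 lanes per register
    // ggml_f32_step       = 8 * 8   = 64  elements per unrolled iteration
    // For n = 100:
    //   np  = 100 & ~(64 - 1) = 64  -> one 8-register pass covers [0, 64)
    //   np2 = 100 & ~(8 - 1)  = 96  -> four single-vector passes cover [64, 96)
    //   the last 4 elements in [96, 100) are handled by svwhilelt_b32(96, 100)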
ggml/src/ggml-cpu/vec.h CHANGED
@@ -5,6 +5,7 @@
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 #include "ggml.h"
+#include "ggml-cpu.h"

 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
@@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG

 inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
+    #if defined(__ARM_FEATURE_SVE)
+
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+
+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
+
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+
+            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
+
+            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
+
+            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
+
+            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
+
+            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
+
+            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
+
+            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
+
+            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
+
+            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
+
+            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
+
+            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
+
+            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
+        }
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += ggml_f32_epr) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+
+            GGML_F32_VEC_STORE(y + i, ay1);
+        }
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg =svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
+
+            svst1_f32(pg, y + np2, ay1);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+        GGML_F32_VEC ax[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] += x[i]*v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
     }

 #if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
-
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
-    }
-
-    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
-                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
-            }
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = np; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation //TODO: Write SVE code
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = 0; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
+
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            vx[k] = GGML_F32_VEC_SET1(v[k][0]);
+        }
+
+        GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+                for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+                    ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
+                    ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+                }
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = np; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
+        }
+    #endif
 #else
     // scalar
     for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
     vDSP_vsmul(y, 1, &v, y, 1, n);
 #elif defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] *= v;
-    }
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 2 * ggml_f32_epr;
+
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ay1;
+        svfloat32_t ay2;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_MUL(ay1, vx);
+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_MUL(ay2, vx);
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b32(np, n);
+            ay1 = svld1_f32(pg, y + np);
+            ay1 = svmul_f32_m(pg, ay1, vx);
+            svst1_f32(pg, y + np, ay1);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] *= v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
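ggml_vec_mad_f32 and ggml_vec_scale_f32 get the same treatment as the dot product (eight- and two-register unrolls plus a predicated tail), while ggml_vec_mad_f32_unroll simply falls back to scalar under SVE for now, as its TODO notes. A small self-contained check harness (an illustrative sketch, not part of the commit) that exercises the predicated-tail pattern of the new scale kernel against a scalar reference, including sizes that do not divide the vector length:

    // Build for SVE (e.g. -march=armv8-a+sve) and run on SVE hardware or an
    // emulator; scale_sve mirrors the structure of the new ggml_vec_scale_f32.
    #include <arm_sve.h>
    #include <math.h>
    #include <stdio.h>

    static void scale_sve(float * y, float v, int n) {
        const int epr = (int) svcntw();
        int i = 0;
        for (; i + epr <= n; i += epr) {
            svfloat32_t ay = svld1_f32(svptrue_b32(), y + i);
            ay = svmul_n_f32_m(svptrue_b32(), ay, v);
            svst1_f32(svptrue_b32(), y + i, ay);
        }
        if (i < n) {                           // predicated leftovers, as in the new kernel
            svbool_t pg = svwhilelt_b32(i, n);
            svfloat32_t ay = svld1_f32(pg, y + i);
            ay = svmul_n_f32_m(pg, ay, v);
            svst1_f32(pg, y + i, ay);
        }
    }

    int main(void) {
        for (int n = 1; n <= 70; n++) {
            float a[70], b[70];
            for (int i = 0; i < n; i++) { a[i] = b[i] = (float) i * 0.5f; }
            scale_sve(a, 3.0f, n);
            for (int i = 0; i < n; i++) { b[i] *= 3.0f; }
            for (int i = 0; i < n; i++) {
                if (fabsf(a[i] - b[i]) > 1e-6f) { printf("mismatch n=%d i=%d\n", n, i); return 1; }
            }
        }
        printf("ok\n");
        return 0;
    }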