Vineel Abhinav committed
Commit 7941e9b · 1 parent: d9bd7ce

ggml: aarch64: Implement SVE F32 kernels for vector functions (llama/13843)

* F32-Mamba-SVE
* F32-Mamba-SVE
* Resolve test errors-1
* Resolve test errors-2
* F32-vec-SVE
* F32-vec-SVE
* F32-vec-SVE
- ggml/src/ggml-cpu/ops.cpp +143 -72
- ggml/src/ggml-cpu/simd-mappings.h +117 -1
- ggml/src/ggml-cpu/vec.cpp +85 -16
- ggml/src/ggml-cpu/vec.h +168 -49
ggml/src/ggml-cpu/ops.cpp
CHANGED
@@ -7641,8 +7641,8 @@ static void ggml_compute_forward_ssm_scan_f32(
            const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
            const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
            const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+            float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+            float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}

            // use the output as the source for the next token-wise iterations
            if (i2 > 0) { s0 = s; }

@@ -8070,6 +8070,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
    #define GGML_F32X_MUL GGML_F32x16_MUL
    #define GGML_F32X_FMA GGML_F32x16_FMA
    #define WKV_VECTOR_SIZE 16
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    #define GGML_F32X GGML_F32xt
+    #define GGML_F32X_SET1 GGML_F32xt_SET1
+    #define GGML_F32X_LOAD GGML_F32xt_LOAD
+    #define GGML_F32X_STORE GGML_F32xt_STORE
+    #define GGML_F32X_MUL GGML_F32xt_MUL
+    #define GGML_F32X_FMA GGML_F32xt_FMA
+    #define WKV_VECTOR_SIZE 8
 #elif defined(__ARM_NEON) && defined(__aarch64__)
    #define GGML_F32X GGML_F32x4
    #define GGML_F32X_SET1 GGML_F32x4_SET1

@@ -8080,8 +8088,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
    #define WKV_VECTOR_SIZE 4
 #endif

+    int wkv_vector_size;
 #ifdef WKV_VECTOR_SIZE
+    #if defined(__ARM_FEATURE_SVE)
+        wkv_vector_size = svcntw();
+    #else
+        wkv_vector_size = WKV_VECTOR_SIZE;
+    #endif
+    const int64_t vec_count = head_size / wkv_vector_size;

    for (int64_t t = 0; t < T; t++) {
        size_t t_offset = t * t_stride;

@@ -8111,7 +8125,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
            GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);

            for (int64_t j = 0; j < vec_count; j++) {
+                size_t base_j = j * wkv_vector_size;
                size_t t_h_j_offset = t_h_offset + base_j;
                size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8136,7 +8150,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
            }

            // Handle remaining elements, this will not be used.
+            for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) {
                size_t t_h_j_offset = t_h_offset + j;
                size_t h_2d_i_j_offset = h_2d_i_offset + j;
                float v_val = v[t_h_j_offset];

@@ -8272,6 +8286,14 @@ static void ggml_compute_forward_gla_f32(
    #define GGML_F32X_MUL GGML_F32x16_MUL
    #define GGML_F32X_FMA GGML_F32x16_FMA
    #define GLA_VECTOR_SIZE 16
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    #define GGML_F32X GGML_F32xt
+    #define GGML_F32X_SET1 GGML_F32xt_SET1
+    #define GGML_F32X_LOAD GGML_F32xt_LOAD
+    #define GGML_F32X_STORE GGML_F32xt_STORE
+    #define GGML_F32X_MUL GGML_F32xt_MUL
+    #define GGML_F32X_FMA GGML_F32xt_FMA
+    #define GLA_VECTOR_SIZE 8
 #elif defined(__ARM_NEON) && defined(__aarch64__)
    #define GGML_F32X GGML_F32x4
    #define GGML_F32X_SET1 GGML_F32x4_SET1

@@ -8282,8 +8304,14 @@ static void ggml_compute_forward_gla_f32(
    #define GLA_VECTOR_SIZE 4
 #endif

+    int gla_vector_size;
 #ifdef GLA_VECTOR_SIZE
+    #if defined(__ARM_FEATURE_SVE)
+        gla_vector_size = svcntw();
+    #else
+        gla_vector_size = GLA_VECTOR_SIZE;
+    #endif
+    const int64_t vec_count = head_size / gla_vector_size;

    for (int64_t t = 0; t < T; t++) {
        size_t t_offset = t * t_stride;

@@ -8310,7 +8338,7 @@ static void ggml_compute_forward_gla_f32(
            GGML_F32X g_vec = GGML_F32X_SET1(g_val);

            for (int64_t j = 0; j < vec_count; j++) {
+                size_t base_j = j * gla_vector_size;
                size_t t_h_j_offset = t_h_offset + base_j;
                size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8334,7 +8362,7 @@ static void ggml_compute_forward_gla_f32(
            }

            // Handle remaining elements, this will not be used.
+            for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) {
                size_t t_h_j_offset = t_h_offset + j;
                size_t h_2d_i_j_offset = h_2d_i_offset + j;
                float v_val = v[t_h_j_offset];

@@ -8443,83 +8471,126 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
    int64_t h_stride_2d = head_size * head_size;

 #if defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation       //TODO: Write SVE code
+        for (int64_t t = 0; t < T; t++) {
+            int64_t t_offset = t * t_stride;
+            int64_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                int64_t h_offset = h * h_stride;
+                int64_t t_h_offset = t_offset + h_offset;
+                int64_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    int64_t t_h_i_offset = t_h_offset + i;
+                    int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float v_val = v[t_h_i_offset];
+
+                    float sa = 0, result = 0;
+                    for (int64_t j = 0; j < head_size; j++) {
+                        sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
+                    }
+
+                    for (int64_t j = 0; j < head_size; j++) {
+                        int64_t t_h_j_offset = t_h_offset + j;
+                        int64_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                        float r_val = r[t_h_j_offset];
+                        float w_val = w[t_h_j_offset];
+                        float k_val = k[t_h_j_offset];
+                        float b_val = b[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
+                        result += state_cur[h_2d_i_j_offset] * r_val;
+                    }
+                    dst_data[t_h_i_offset] = result;
+                }
+            }
+        }
+    #else
+        for (int64_t t = 0; t < T; t++) {
+            int64_t t_offset = t * t_stride;
+            int64_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                int64_t h_offset = h * h_stride;
+                int64_t t_h_offset = t_offset + h_offset;
+                int64_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t ii = 0; ii < head_size; ii++) {
+                    int64_t t_h_i_offset = t_h_offset + ii;
+                    int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
+
+                    GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
+
+                    float sa = 0;
+                    {
+                        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+                        GGML_F32_VEC ax[GGML_F32_ARR];
+                        GGML_F32_VEC ay[GGML_F32_ARR];
+                        for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
+                            for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
+                                ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
+                                ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
+                                sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
+                            }
+                        }
+                        GGML_F32_VEC_REDUCE(sa, sum);
+                    }
+
+                    GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
+
+                    int64_t j = 0;
+                    GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+                    for (; j < head_size; j += GGML_F32_STEP) {
+                        for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
+                            int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
+                            int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
+
+                            GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
+                            GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
+                            GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
+                            GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
+
+                            k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
+
+                            GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
+                            // kv + s * decay + sa * b
+                            state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
+                            state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
+                            GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
+
+                            result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
+                        }
+                    }
+                    GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
+
+                    // There shouldn't be left-overs though.
+                    for (; j < head_size; j++) {
+                        int64_t t_h_j_offset = t_h_offset + j;
+                        int64_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                        float r_val = r[t_h_j_offset];
+                        float w_val = w[t_h_j_offset];
+                        float k_val = k[t_h_j_offset];
+                        float b_val = b[t_h_j_offset];
+                        float kv_val = v[t_h_i_offset] * k_val;
+
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
+                        dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
                    }
                }
            }
        }
+    #endif
 #else
    for (int64_t t = 0; t < T; t++) {
        int64_t t_offset = t * t_stride;
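The WKV6/GLA hunks above replace the compile-time WKV_VECTOR_SIZE/GLA_VECTOR_SIZE constant with a width read at runtime via svcntw() on SVE builds, then split each head into full-vector iterations plus a scalar remainder loop. Below is a minimal standalone sketch of that pattern, not the ggml kernel itself: the function name and the decay-style update are illustrative, and it assumes an aarch64 toolchain built with SVE enabled (e.g. -march=armv8-a+sve).

// Sketch only: runtime vector-width selection in the style of the
// WKV_VECTOR_SIZE -> wkv_vector_size change above.
#include <arm_sve.h>
#include <stdint.h>

// y[i] = y[i]*decay + k[i], processed in full SVE vectors with a scalar tail.
static void sketch_decay_add(float * y, const float * k, int64_t n, float decay) {
    const int64_t vl        = (int64_t) svcntw(); // f32 lanes per vector, known only at runtime
    const int64_t vec_count = n / vl;             // mirrors head_size / wkv_vector_size

    const svbool_t    pg        = svptrue_b32();
    const svfloat32_t decay_vec = svdup_n_f32(decay);

    for (int64_t j = 0; j < vec_count; j++) {
        const int64_t base = j * vl;              // mirrors base_j = j * wkv_vector_size
        svfloat32_t yv = svld1_f32(pg, y + base);
        svfloat32_t kv = svld1_f32(pg, k + base);
        yv = svmad_f32_m(pg, yv, decay_vec, kv);  // yv*decay + kv
        svst1_f32(pg, y + base, yv);
    }
    for (int64_t j = vec_count * vl; j < n; j++) { // scalar remainder, as in the kernels above
        y[j] = y[j] * decay + k[j];
    }
}

Because svcntw() is only known at runtime, vec_count and base_j cannot be folded into compile-time constants the way the fixed AVX512 and NEON widths can, which is why the kernels switch to the wkv_vector_size/gla_vector_size variables.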
ggml/src/ggml-cpu/simd-mappings.h
CHANGED
@@ -17,7 +17,123 @@
 // number of elements to fit in a single register
 //

+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
+
+#define GGML_SIMD
+
+// F32 SVE
+#define GGML_F32_EPR 8
+#define DEFAULT_PG svptrue_b32()
+
+#define GGML_F32xt svfloat32_t
+#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
+#define GGML_F32xt_SET1(x) svdup_n_f32(x)
+#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
+#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
+#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
+#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
+#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
+#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
+{                                                     \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2);       \
+    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4);       \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6);       \
+    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8);       \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3);       \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7);       \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5);       \
+    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
+}
+#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
+
+#define GGML_F32_VEC        GGML_F32xt
+#define GGML_F32_VEC_ZERO   GGML_F32xt_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32xt_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32xt_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32xt_STORE
+#define GGML_F32_VEC_FMA    GGML_F32xt_FMA
+#define GGML_F32_VEC_ADD    GGML_F32xt_ADD
+#define GGML_F32_VEC_MUL    GGML_F32xt_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    #define GGML_F16_STEP 32
+    #define GGML_F16_EPR  8
+
+    #define GGML_F16x8              float16x8_t
+    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
+    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
+    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
+    #define GGML_F16x8_STORE        vst1q_f16
+    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+    #define GGML_F16x8_ADD          vaddq_f16
+    #define GGML_F16x8_MUL          vmulq_f16
+    #define GGML_F16x8_REDUCE(res, x)                               \
+    do {                                                            \
+        int offset = GGML_F16_ARR >> 1;                             \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+    } while (0)
+
+    #define GGML_F16_VEC                GGML_F16x8
+    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
+    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
+    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
+    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
+#else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+
+    #define GGML_F16_STEP 16
+    #define GGML_F16_EPR  4
+
+    #define GGML_F32Cx4              float32x4_t
+    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
+    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
+    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
+    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define GGML_F32Cx4_ADD          vaddq_f32
+    #define GGML_F32Cx4_MUL          vmulq_f32
+    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
+
+    #define GGML_F16_VEC                GGML_F32Cx4
+    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
+    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
+    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
+    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

 #define GGML_SIMD

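The new GGML_F32xt block above maps the generic GGML_F32_VEC_* names onto predicated SVE intrinsics using an all-true predicate (DEFAULT_PG), and GGML_F32xt_REDUCE folds eight accumulators pairwise before a single horizontal svaddv. The following is a small sketch of what a call site reduces to after macro expansion; it is illustrative only, assumes an SVE build, and assumes x and y point at one full vector's worth of floats.

// Sketch of the expansion of the GGML_F32xt wrappers defined above.
#include <arm_sve.h>

static float sketch_fma_then_reduce(const float * x, const float * y) {
    // GGML_F32_VEC_ZERO -> svdup_n_f32(0.0f)
    svfloat32_t sum = svdup_n_f32(0.0f);

    // GGML_F32_VEC_LOAD(p) -> svld1_f32(svptrue_b32(), p): DEFAULT_PG supplies the predicate
    svfloat32_t ax = svld1_f32(svptrue_b32(), x);
    svfloat32_t ay = svld1_f32(svptrue_b32(), y);

    // GGML_F32_VEC_FMA(a, b, c) -> svmad_f32_m(svptrue_b32(), a, b, c) == a*b + c
    sum = svmad_f32_m(svptrue_b32(), ax, ay, sum);

    // GGML_F32xt_REDUCE_ONE -> svaddv: horizontal add over all active lanes
    return svaddv_f32(svptrue_b32(), sum);
}

Note that GGML_F32_EPR is fixed at 8 here for the step-size bookkeeping, while the kernels that need the true lane count query it at runtime (svcntw or ggml_cpu_get_sve_cnt), since the SVE vector length is implementation defined.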
ggml/src/ggml-cpu/vec.cpp
CHANGED
@@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G

 #if defined(GGML_SIMD)
    float sumf = 0.0f;

+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
+
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t sum1 = svdup_n_f32(0.0f);
+        svfloat32_t sum2 = svdup_n_f32(0.0f);
+        svfloat32_t sum3 = svdup_n_f32(0.0f);
+        svfloat32_t sum4 = svdup_n_f32(0.0f);
+        svfloat32_t sum5 = svdup_n_f32(0.0f);
+        svfloat32_t sum6 = svdup_n_f32(0.0f);
+        svfloat32_t sum7 = svdup_n_f32(0.0f);
+        svfloat32_t sum8 = svdup_n_f32(0.0f);
+        svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
+        svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+
+            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
+
+            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
+            sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
+
+            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
+            sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
+
+            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
+            sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
+
+            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
+            sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
+
+            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
+            sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
+
+            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
+            sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
+        }
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += ggml_f32_epr) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        }
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
+        }
+        // reduce sum1,sum2 to sum1
+        GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));

+        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

+        GGML_F32_VEC ax[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];

+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
        }

+        // reduce sum0..sum3 to sum0
+        GGML_F32_VEC_REDUCE(sumf, sum);

+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += x[i]*y[i];
+        }
+    #endif
 #else
    // scalar
    ggml_float sumf = 0.0;
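In the SVE path of ggml_vec_dot_f32 above, the main loop consumes eight vectors per iteration into eight independent accumulators, a second loop mops up any remaining whole vectors, and the final partial vector is handled with an svwhilelt_b32 predicate instead of scalar code. Below is a condensed sketch of the same predicated-tail idea with a single accumulator; it is a sketch under the assumption that arm_sve.h is available, not the ggml routine.

// Sketch: dot product with an all-true main loop and a whilelt-predicated tail.
#include <arm_sve.h>

static float sketch_dot_f32(const float * x, const float * y, int n) {
    const int vl = (int) svcntw();               // runtime vector length in f32 lanes
    svfloat32_t sum = svdup_n_f32(0.0f);

    int i = 0;
    for (; i + vl <= n; i += vl) {               // full vectors, all-true predicate
        const svbool_t pg = svptrue_b32();
        sum = svmad_f32_m(pg, svld1_f32(pg, x + i), svld1_f32(pg, y + i), sum);
    }
    if (i < n) {                                 // tail: predicate covers only the last n - i lanes
        const svbool_t pg = svwhilelt_b32_s32(i, n);
        sum = svmad_f32_m(pg, svld1_f32(pg, x + i), svld1_f32(pg, y + i), sum);
    }
    return svaddv_f32(svptrue_b32(), sum);       // horizontal add of all lanes
}

The merging form svmad_f32_m keeps the previous accumulator values in the inactive tail lanes, so no masking of the final reduction is needed; the ggml code relies on the same property when it folds sum1..sum8 with GGML_F32_VEC_REDUCE.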
ggml/src/ggml-cpu/vec.h
CHANGED
@@ -5,6 +5,7 @@
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 #include "ggml.h"
+#include "ggml-cpu.h"

 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>

@@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG

 inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)

+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f32_step) {

+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);

+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
+
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+
+            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
+
+            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
+
+            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
+
+            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
+
+            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
+
+            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
+
+            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
+
+            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
+
+            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
+
+            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
+
+            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
+
+            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
        }
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += ggml_f32_epr) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+
+            GGML_F32_VEC_STORE(y + i, ay1);
+        }
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
+
+            svst1_f32(pg, y + np2, ay1);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));

+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+        GGML_F32_VEC ax[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] += x[i]*v;
+        }
+    #endif
 #else
    // scalar
    for (int i = 0; i < n; ++i) {

@@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 }

 #if defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation       //TODO: Write SVE code
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = 0; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));

+        GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            vx[k] = GGML_F32_VEC_SET1(v[k][0]);
+        }

+        GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];

+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

+                for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+                    ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
+                    ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+                }

+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
        }

+        // leftovers
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = np; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
        }
+    #endif
 #else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {

@@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
 #elif defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 2 * ggml_f32_epr;
+
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ay1;
+        svfloat32_t ay2;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_MUL(ay1, vx);
+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_MUL(ay2, vx);
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b32(np, n);
+            ay1 = svld1_f32(pg, y + np);
+            ay1 = svmul_f32_m(pg, ay1, vx);
+            svst1_f32(pg, y + np, ay1);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));

+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

+        GGML_F32_VEC ay[GGML_F32_ARR];

+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
        }

+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] *= v;
+        }
+    #endif
 #else
    // scalar
    for (int i = 0; i < n; ++i) {
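The loop bounds in the SVE paths of ggml_vec_mad_f32 and ggml_vec_scale_f32 above all follow the same arithmetic: np rounds n down to a multiple of the unrolled step, np2 rounds it down to a multiple of a single vector, and whatever remains is covered by a whilelt predicate. A worked example of those bounds is shown below; the 256-bit vector length is an assumption chosen only to make the numbers concrete.

// Worked example of the np / np2 / tail split used in the SVE paths above.
#include <stdio.h>

int main(void) {
    const int n             = 1003;
    const int ggml_f32_epr  = 256 / 32;           // 8 f32 lanes, assuming a 256-bit SVE vector
    const int ggml_f32_step = 8 * ggml_f32_epr;   // 64 elements per unrolled main-loop iteration

    const int np   = n & ~(ggml_f32_step - 1);    // 960  -> 8-register unrolled loop covers [0, 960)
    const int np2  = n & ~(ggml_f32_epr  - 1);    // 1000 -> single-vector loop covers [960, 1000)
    const int tail = n - np2;                     // 3    -> svwhilelt_b32(np2, n) covers [1000, 1003)

    printf("np=%d np2=%d tail=%d\n", np, np2, tail);
    return 0;
}

Because both masks are powers of two minus one, the bitwise AND is an exact "round down to multiple of" operation, so the three regions tile [0, n) with no overlap and no gap; the predicated store in the tail then touches only the last few elements of y.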