Spaces:
Running
Running
ggml : use vaddvq_f32 for slightly more efficient reduce
Browse files
ggml.c
CHANGED
|
@@ -373,8 +373,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
|
|
| 373 |
sum2 = vaddq_f32(sum2, sum3);
|
| 374 |
sum0 = vaddq_f32(sum0, sum2);
|
| 375 |
|
| 376 |
-
float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
|
| 377 |
-
sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
|
| 378 |
|
| 379 |
// leftovers
|
| 380 |
for (int i = n16; i < n; ++i) {
|
|
@@ -557,9 +556,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 557 |
|
| 558 |
// reduce sum0f32 and sum1f32 to sumf
|
| 559 |
sum0f32 = vaddq_f32(sum0f32, sum1f32);
|
| 560 |
-
|
| 561 |
-
float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
|
| 562 |
-
sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
|
| 563 |
#else
|
| 564 |
float32x4_t sum0 = vdupq_n_f32(0);
|
| 565 |
float32x4_t sum1 = vdupq_n_f32(0);
|
|
@@ -611,9 +608,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 611 |
sum4 = vaddq_f32(sum4, sum6);
|
| 612 |
sum0 = vaddq_f32(sum0, sum4);
|
| 613 |
|
| 614 |
-
|
| 615 |
-
float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
|
| 616 |
-
sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
|
| 617 |
#endif
|
| 618 |
|
| 619 |
// leftovers
|
|
|
|
| 373 |
sum2 = vaddq_f32(sum2, sum3);
|
| 374 |
sum0 = vaddq_f32(sum0, sum2);
|
| 375 |
|
| 376 |
+
sumf = vaddvq_f32(sum0);
|
|
|
|
| 377 |
|
| 378 |
// leftovers
|
| 379 |
for (int i = n16; i < n; ++i) {
|
|
|
|
| 556 |
|
| 557 |
// reduce sum0f32 and sum1f32 to sumf
|
| 558 |
sum0f32 = vaddq_f32(sum0f32, sum1f32);
|
| 559 |
+
sumf = vaddvq_f32(sum0f32);
|
|
|
|
|
|
|
| 560 |
#else
|
| 561 |
float32x4_t sum0 = vdupq_n_f32(0);
|
| 562 |
float32x4_t sum1 = vdupq_n_f32(0);
|
|
|
|
| 608 |
sum4 = vaddq_f32(sum4, sum6);
|
| 609 |
sum0 = vaddq_f32(sum0, sum4);
|
| 610 |
|
| 611 |
+
sumf = vaddvq_f32(sum0);
|
|
|
|
|
|
|
| 612 |
#endif
|
| 613 |
|
| 614 |
// leftovers
|