ggerganov committed on
Commit
550fbf8
·
unverified ·
1 Parent(s): 0d5a830

ggml : use vaddvq_f32 for slightly more efficient reduce

Browse files
Files changed (1) hide show
  1. ggml.c +3 -8
ggml.c CHANGED
@@ -373,8 +373,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
373
  sum2 = vaddq_f32(sum2, sum3);
374
  sum0 = vaddq_f32(sum0, sum2);
375
 
376
- float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
377
- sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
378
 
379
  // leftovers
380
  for (int i = n16; i < n; ++i) {
@@ -557,9 +556,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
557
 
558
  // reduce sum0f32 and sum1f32 to sumf
559
  sum0f32 = vaddq_f32(sum0f32, sum1f32);
560
-
561
- float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
562
- sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
563
  #else
564
  float32x4_t sum0 = vdupq_n_f32(0);
565
  float32x4_t sum1 = vdupq_n_f32(0);
@@ -611,9 +608,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
611
  sum4 = vaddq_f32(sum4, sum6);
612
  sum0 = vaddq_f32(sum0, sum4);
613
 
614
- // reduce sum0 to sumf
615
- float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
616
- sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
617
  #endif
618
 
619
  // leftovers
 
373
  sum2 = vaddq_f32(sum2, sum3);
374
  sum0 = vaddq_f32(sum0, sum2);
375
 
376
+ sumf = vaddvq_f32(sum0);
 
377
 
378
  // leftovers
379
  for (int i = n16; i < n; ++i) {
 
556
 
557
  // reduce sum0f32 and sum1f32 to sumf
558
  sum0f32 = vaddq_f32(sum0f32, sum1f32);
559
+ sumf = vaddvq_f32(sum0f32);
 
 
560
  #else
561
  float32x4_t sum0 = vdupq_n_f32(0);
562
  float32x4_t sum1 = vdupq_n_f32(0);
 
608
  sum4 = vaddq_f32(sum4, sum6);
609
  sum0 = vaddq_f32(sum0, sum4);
610
 
611
+ sumf = vaddvq_f32(sum0);
 
 
612
  #endif
613
 
614
  // leftovers