ggerganov committed on
Commit
84c1cc7
·
unverified ·
1 Parent(s): 6ee8740

ggml : update WASM SIMD

Browse files
Files changed (2) hide show
  1. bindings/javascript/whisper.js +0 -0
  2. ggml.c +85 -15
bindings/javascript/whisper.js CHANGED
The diff for this file is too large to render. See raw diff
 
ggml.c CHANGED
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
740
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
741
  }
742
 
743
- float vminvq_f32(float32x4_t v) {
744
  return
745
  MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
746
  MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
747
  }
748
 
749
- float vmaxvq_f32(float32x4_t v) {
750
  return
751
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
752
  MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
753
  }
754
 
755
- int32x4_t vcvtnq_s32_f32(float32x4_t v) {
756
  int32x4_t res;
757
 
758
  res[0] = roundf(vgetq_lane_f32(v, 0));
@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766
  #endif
767
  #endif
768
 
769
-
770
  #define QK4_0 32
771
  typedef struct {
772
  ggml_fp16_t d; // delta
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1056
  y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
1057
  }
1058
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1059
  #elif defined(__AVX2__) || defined(__AVX__)
1060
  for (int i = 0; i < nb; i++) {
1061
  // Load elements into 4 AVX vectors
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1224
 
1225
  y[i].s = d * vaddvq_s32(accv);
1226
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1227
  #elif defined(__AVX2__) || defined(__AVX__)
1228
  for (int i = 0; i < nb; i++) {
1229
  // Load elements into 4 AVX vectors
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2598
  const block_q8_0 * restrict y0 = &y[i];
2599
 
2600
  const v128_t m4b = wasm_i8x16_splat(0x0F);
2601
- const v128_t s16b = wasm_i8x16_splat(0x10);
2602
 
2603
  // extract the 5th bit
2604
  memcpy(&qh, x0->qh, sizeof(qh));
@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2636
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2637
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2638
 
2639
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2640
-
2641
  // dot product
2642
  sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
2643
  wasm_i32x4_add(
2644
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2645
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2646
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2647
- wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
 
2648
  }
2649
 
2650
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2868
  const v128_t v0l = wasm_v128_and (v0, m4b);
2869
  const v128_t v0h = wasm_u8x16_shr(v0, 4);
2870
 
2871
- static bool x = true;
2872
-
2873
  // add high bit
2874
  const v128_t v0lf = wasm_v128_or(v0l, qhl);
2875
  const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2892
  // dot product
2893
  sumv = wasm_f32x4_add(sumv,
2894
  wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
2895
- wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2896
- wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2897
- wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2898
- wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2899
- wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
2900
  }
2901
 
2902
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
 
740
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
741
  }
742
 
743
+ inline static float vminvq_f32(float32x4_t v) {
744
  return
745
  MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
746
  MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
747
  }
748
 
749
+ inline static float vmaxvq_f32(float32x4_t v) {
750
  return
751
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
752
  MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
753
  }
754
 
755
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
756
  int32x4_t res;
757
 
758
  res[0] = roundf(vgetq_lane_f32(v, 0));
 
766
  #endif
767
  #endif
768
 
 
769
  #define QK4_0 32
770
  typedef struct {
771
  ggml_fp16_t d; // delta
 
1055
  y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
1056
  }
1057
  }
1058
+ #elif defined(__wasm_simd128__)
1059
+ for (int i = 0; i < nb; i++) {
1060
+ v128_t srcv [8];
1061
+ v128_t asrcv[8];
1062
+ v128_t amaxv[8];
1063
+
1064
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1065
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1066
+
1067
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1068
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1069
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1070
+
1071
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1072
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
1073
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1074
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
1075
+
1076
+ const float d = amax / ((1 << 7) - 1);
1077
+ const float id = d ? 1.0f/d : 0.0f;
1078
+
1079
+ y[i].d = GGML_FP32_TO_FP16(d);
1080
+
1081
+ for (int j = 0; j < 8; j++) {
1082
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1083
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1084
+
1085
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1086
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1087
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1088
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1089
+ }
1090
+ }
1091
  #elif defined(__AVX2__) || defined(__AVX__)
1092
  for (int i = 0; i < nb; i++) {
1093
  // Load elements into 4 AVX vectors
 
1256
 
1257
  y[i].s = d * vaddvq_s32(accv);
1258
  }
1259
+ #elif defined(__wasm_simd128__)
1260
+ for (int i = 0; i < nb; i++) {
1261
+ v128_t srcv [8];
1262
+ v128_t asrcv[8];
1263
+ v128_t amaxv[8];
1264
+
1265
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1266
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1267
+
1268
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1269
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1270
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1271
+
1272
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1273
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
1274
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1275
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
1276
+
1277
+ const float d = amax / ((1 << 7) - 1);
1278
+ const float id = d ? 1.0f/d : 0.0f;
1279
+
1280
+ y[i].d = d;
1281
+
1282
+ v128_t accv = wasm_i32x4_splat(0);
1283
+
1284
+ for (int j = 0; j < 8; j++) {
1285
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1286
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1287
+
1288
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1289
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1290
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1291
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1292
+
1293
+ accv = wasm_i32x4_add(accv, vi);
1294
+ }
1295
+
1296
+ y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
1297
+ wasm_i32x4_extract_lane(accv, 1) +
1298
+ wasm_i32x4_extract_lane(accv, 2) +
1299
+ wasm_i32x4_extract_lane(accv, 3));
1300
+ }
1301
  #elif defined(__AVX2__) || defined(__AVX__)
1302
  for (int i = 0; i < nb; i++) {
1303
  // Load elements into 4 AVX vectors
 
2672
  const block_q8_0 * restrict y0 = &y[i];
2673
 
2674
  const v128_t m4b = wasm_i8x16_splat(0x0F);
 
2675
 
2676
  // extract the 5th bit
2677
  memcpy(&qh, x0->qh, sizeof(qh));
 
2709
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2710
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2711
 
 
 
2712
  // dot product
2713
  sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
2714
  wasm_i32x4_add(
2715
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2716
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2717
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2718
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2719
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
2720
  }
2721
 
2722
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
 
2940
  const v128_t v0l = wasm_v128_and (v0, m4b);
2941
  const v128_t v0h = wasm_u8x16_shr(v0, 4);
2942
 
 
 
2943
  // add high bit
2944
  const v128_t v0lf = wasm_v128_or(v0l, qhl);
2945
  const v128_t v0hf = wasm_v128_or(v0h, qhh);
 
2962
  // dot product
2963
  sumv = wasm_f32x4_add(sumv,
2964
  wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
2965
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2966
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2967
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2968
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2969
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
2970
  }
2971
 
2972
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +