ggml : update WASM SIMD
Changed files:
- bindings/javascript/whisper.js (+0 -0)
- ggml.c (+85 -15)
bindings/javascript/whisper.js
CHANGED

The diff for this file is too large to render. See raw diff.
ggml.c
CHANGED
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-float vminvq_f32(float32x4_t v) {
+inline static float vminvq_f32(float32x4_t v) {
     return
         MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }
 
-float vmaxvq_f32(float32x4_t v) {
+inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }
 
-int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
    int32x4_t res;
 
    res[0] = roundf(vgetq_lane_f32(v, 0));
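
The shims above emulate NEON horizontal reductions for toolchains that lack the intrinsics; marking them inline static gives them internal linkage, so the definitions cannot collide with identically named external symbols. A minimal scalar sketch of the same reductions, assuming a plain float[4] stands in for float32x4_t (names hypothetical):

    #include <math.h>
    #include <stdint.h>

    // Scalar equivalents of the shimmed reductions (illustrative only).
    static inline float minv_f32x4(const float v[4]) {
        float m = v[0];
        for (int i = 1; i < 4; i++) if (v[i] < m) m = v[i];
        return m;
    }

    static inline void cvtn_s32_f32x4(const float v[4], int32_t r[4]) {
        // note: NEON's vcvtnq rounds ties to even; the shim uses roundf,
        // which rounds ties away from zero, and this mirrors the shim
        for (int i = 0; i < 4; i++) r[i] = (int32_t)roundf(v[i]);
    }
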
@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 #endif
 #endif
 
-
 #define QK4_0 32
 typedef struct {
     ggml_fp16_t d; // delta
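
For orientation, the block type whose definition begins in the context above continues past the hunk; reproduced here as a reference (not part of this diff), the full q4_0 block in ggml.c of this vintage is:

    #define QK4_0 32
    typedef struct {
        ggml_fp16_t d;              // delta (per-block scale)
        uint8_t     qs[QK4_0 / 2];  // 32 weights packed as 4-bit nibbles
    } block_q4_0;
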
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
             y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
         }
     }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+        }
+    }
 #elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
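
The new __wasm_simd128__ branch mirrors the NEON path above it: eight v128 registers cover the 32-float block, the absolute maximum is reduced pairwise, the scale is d = amax / 127, and the scaled values are converted with saturating truncation. A hedged scalar reference of the same computation, using an illustrative block type (the real block_q8_0 stores d as ggml_fp16_t):

    #include <math.h>
    #include <stdint.h>

    typedef struct {
        float  d;       // delta (fp16 in the real block_q8_0)
        int8_t qs[32];  // quants
    } q8_0_block_ref;   // hypothetical stand-in for illustration

    static void quantize_row_q8_0_ref(const float * x, q8_0_block_ref * y, int k) {
        const int nb = k / 32;
        for (int i = 0; i < nb; i++) {
            float amax = 0.0f; // absolute max over the block
            for (int j = 0; j < 32; j++) {
                const float av = fabsf(x[i*32 + j]);
                if (av > amax) amax = av;
            }
            const float d  = amax / ((1 << 7) - 1);
            const float id = d ? 1.0f/d : 0.0f;
            y[i].d = d;
            for (int j = 0; j < 32; j++) {
                // the cast truncates toward zero, like wasm_i32x4_trunc_sat_f32x4;
                // |x*id| <= 127 by construction, so no saturation is needed here
                y[i].qs[j] = (int8_t)(x[i*32 + j]*id);
            }
        }
    }
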
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
 
         y[i].s = d * vaddvq_s32(accv);
     }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        v128_t accv = wasm_i32x4_splat(0);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+
+            accv = wasm_i32x4_add(accv, vi);
+        }
+
+        y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
+                      wasm_i32x4_extract_lane(accv, 1) +
+                      wasm_i32x4_extract_lane(accv, 2) +
+                      wasm_i32x4_extract_lane(accv, 3));
+    }
 #elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
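
quantize_row_q8_1 gets the same WASM treatment, with one addition: alongside the quants it stores s = d * sum(qs), which dot kernels against q4_1/q5_1 use to fold the other operand's min term in without a second pass over the data. A sketch of just that extra step, continuing the illustrative layout above:

    // s as the new WASM path computes it: the SIMD code accumulates accv in
    // four i32 lanes and then sums the lanes; scalar-style that is simply:
    static float q8_1_block_s(const int8_t qs[32], float d) {
        int sum = 0;
        for (int j = 0; j < 32; j++) sum += qs[j];
        return d * (float)sum;
    }
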
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i];
 
         const v128_t m4b  = wasm_i8x16_splat(0x0F);
-        const v128_t s16b = wasm_i8x16_splat(0x10);
 
         // extract the 5th bit
         memcpy(&qh, x0->qh, sizeof(qh));
@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
         const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
 
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
         // dot product
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
                 wasm_i32x4_add(
                     wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                    wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                     wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                   wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d * y0->d)));
+                                   wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
     }
 
     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
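
Both scale factors now pass through GGML_FP16_TO_FP32, consistent with block_q8_0 storing its delta as fp16 in this revision, so the float-typed x0d temporary is gone. For reference, the per-block quantity the kernel accumulates can be written scalar-style, following ggml's reference formulation (function name hypothetical):

    #include <stdint.h>

    // One q5_0 x q8_0 block: low/high nibbles plus the 5th bit from qh,
    // re-centered by -16, dotted against 32 int8 quants, then scaled.
    static float dot_q5_0_q8_0_block(const uint8_t qs[16], uint32_t qh,
                                     const int8_t yq[32], float xd, float yd) {
        int sumi = 0;
        for (int j = 0; j < 16; j++) {
            const int xh0 = ((qh >> (j +  0)) & 1) << 4; // 5th bit, low nibble
            const int xh1 = ((qh >> (j + 16)) & 1) << 4; // 5th bit, high nibble
            const int x0 = (int)((qs[j] & 0x0F) | xh0) - 16;
            const int x1 = (int)((qs[j] >>   4) | xh1) - 16;
            sumi += x0*yq[j] + x1*yq[j + 16];
        }
        return (xd * yd) * (float)sumi;
    }
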
@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const v128_t v0l = wasm_v128_and (v0, m4b);
         const v128_t v0h = wasm_u8x16_shr(v0, 4);
 
-        static bool x = true;
-
         // add high bit
         const v128_t v0lf = wasm_v128_or(v0l, qhl);
         const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         // dot product
         sumv = wasm_f32x4_add(sumv,
             wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
-
-
-
-
-                wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
+                wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                               wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                               wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
     }
 
     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
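
The q5_1 kernel computes the same dot without the -16 re-centering; the block's min m instead pairs with the q8_1 s term when the block contributions are summed (that part lies outside this hunk). A scalar sketch under the same assumptions as above:

    #include <stdint.h>

    // One q5_1 x q8_1 block: unsigned 5-bit quants dotted against int8 quants;
    // the min folds in via ys = yd * sum(yq), precomputed at quantization time.
    static float dot_q5_1_q8_1_block(const uint8_t qs[16], uint32_t qh,
                                     const int8_t yq[32],
                                     float xd, float xm, float yd, float ys) {
        int sumi = 0;
        for (int j = 0; j < 16; j++) {
            const int xh0 = ((qh >> (j +  0)) & 1) << 4;
            const int xh1 = ((qh >> (j + 16)) & 1) << 4;
            sumi += (int)((qs[j] & 0x0F) | xh0) * yq[j]
                  + (int)((qs[j] >>   4) | xh1) * yq[j + 16];
        }
        return (xd * yd) * (float)sumi + xm * ys;
    }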